/*==============================================================================
FILE NAME: Process_Enforcements.do
INPUTS: enforcements.dta
OUTPUTS: Enforcements_Clean.dta
UPDATED: 16 June 2022
==============================================================================*/

/* Set directory if working independently through code */
if c(username)=="" { //insert username
    global rootdir "" // insert root path
    global processed_data "$rootdir/processed_data"  // Define global paths for replication package
} 

// Prevent Stata from pausing output
set more off

// Load raw enforcement data
use "$processed_data/enforcement.dta", clear

// Drop unnecessary columns
drop Address Address2 State PhysicalLocation N P InvestigationStatusDate ViolationCode ViolationStatus ViolationStatusDate ViolationResolution ViolationAllegation ALLEGATION_TXT ActivityCode

// Rename key columns for clarity
rename RegulatedEntityNo RN  
rename CustomerNo CN
rename InvestigationNo IN
rename ViolationTrackNo VN
rename O ZipCode_alt

// Convert zip code columns to numeric
destring ZipCode ZipCode_alt, replace

// Flag and fill missing zip codes using alternate column
gen ZipFlag_EN = 1 if ZipCode == . & ZipCode_alt != .
replace ZipCode = ZipCode_alt if ZipCode == . & ZipCode_alt != .
drop ZipCode_alt ZipCodeExt

// Restrict zip codes to valid Texas range
replace ZipCode = . if ZipCode < 73301 
replace ZipCode = . if ZipCode > 88589

// Fill missing city values using NearestCity
replace City = NearestCity if City == "" & NearestCity != ""

// Drop NearestCity column
drop NearestCity

// Rename region, city, zip, and county columns for enforcement events
foreach x in TCEQRegion City ZipCode County {
    rename `x' `x'_EN
}

// Remove duplicate rows
duplicates drop 

// Create enforcement indicator
gen enforcement = 1 

// Create media-specific enforcement indicators
gen enforcement_air = 1 if Media == "AIR"
replace enforcement_air = 0 if Media == "WATER" | Media == "WASTE"
gen enforcement_water = 1 if Media == "WATER"
replace enforcement_water = 0 if Media == "AIR" | Media == "WASTE"
gen enforcement_waste = 1 if Media == "WASTE"
replace enforcement_waste = 0 if Media == "AIR" | Media == "WATER"

// Rename notice date column for clarity
rename NoticeDate EN_Date /*all NoticeTypes = NOE so Notice is Notice of Enforcement not Notice of Violation */

// Count matches and mismatches between enforcement date and investigation start date for AIR media
count if EN_Date==InvestigationStartDate & Media=="AIR"
count if EN_Date!=InvestigationStartDate & Media=="AIR"

// Drop media column
drop Media

// Extract day, month, and year from enforcement date
gen day = day(EN_Date)
gen month = month(EN_Date)
gen year = year(EN_Date)

// Save cleaned enforcement data
save "$processed_data/Enforcements_Clean.dta", replace